!pip install collinearity
from collinearity import SelectNonCollinear
# Essentials
import numpy as np
import pandas as pd
import itertools
#import random
# Stats
from scipy import stats
from scipy.stats import skew, norm
from scipy.stats import boxcox_normmax
from scipy.special import boxcox1p
# Plots
import seaborn as sns
import matplotlib.pyplot as plt
# Preprocessing Libraries
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedShuffleSplit
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel, mutual_info_classif, f_classif, SelectKBest
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, Normalizer
from sklearn.decomposition import PCA
# Machine Learning Libraries
from sklearn.feature_selection import f_regression
import statsmodels.api as sm
import sklearn
from sklearn import svm
import xgboost as xgb
from sklearn import tree
from sklearn.svm import SVC
from sklearn.metrics import roc_curve
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import classification_report
from mlxtend.classifier import StackingCVClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesRegressor, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import recall_score, f1_score, roc_auc_score, log_loss, auc
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from sklearn.metrics import precision_recall_curve, average_precision_score, plot_confusion_matrix
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.feature_selection import SelectFromModel, mutual_info_classif, f_classif, SelectKBest
from sklearn.naive_bayes import GaussianNB
from nltk.classify.scikitlearn import SklearnClassifier
# Mapper
from sklearn_pandas import DataFrameMapper
# Oversample
from imblearn.over_sampling import SMOTE
# Silence library warnings (deprecations, convergence notices) to keep notebook output readable.
import warnings
warnings.filterwarnings('ignore')
# Render matplotlib figures inline (Jupyter/Colab magic).
%matplotlib inline
import io
import os
# Allow multiple outputs from a single notebook cell (not just the last expression).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Display more rows and full column width when printing DataFrames.
pd.set_option('display.max_colwidth', None)
pd.options.display.max_rows = 999
from google.colab import drive
from pathlib import Path
# Project root on the mounted Google Drive.
main_path = '/content/drive/MyDrive/machine_learning/'
# change working directory
#print(os.chdir(main_path))
# get current directory
print(os.getcwd())
os.listdir()
# Mount Google Drive so data.csv is reachable (force_remount tolerates a prior mount).
drive.mount("/content/drive", force_remount=True)
df = pd.read_csv(main_path+'/data.csv')
# Target column; later plots label 1 as "Bankrupt" and 0 as "Not Bankrupt".
df_bankrupt = df['Bankrupt?']
# Untouched copy of the raw data (data_trans below mutates df in place).
df_copy = df.copy()
df_bankrupt.head()
df_copy.head()
print(df.shape)
df.head()
df.info() # no missing value?
df.describe()
# Per-column percentage of missing values, sorted descending; only columns with
# any missingness are kept (expected empty given df.info above).
data_percent_missing = df.isnull().sum() * 100 / len(df)
data_missing_value = pd.DataFrame({'column_name': df.columns,'percent_missing': data_percent_missing})
data_missing_value = data_missing_value.sort_values('percent_missing',ascending=False)
data_missing_value['dtypes'] = df.dtypes
data_missing_value = data_missing_value[data_missing_value.percent_missing>0]
print(data_missing_value)
# Class counts of the target; SMOTE oversampling is applied later in this notebook.
print(df['Bankrupt?'].value_counts())
sns.countplot(x='Bankrupt?', data=df);
# Figure 1: Countplot of target variable
# Correlation of every numeric feature with the target, strongest first.
df.corr()['Bankrupt?'].sort_values(ascending=False)
# Figure 2: Correlation with target variable
# check collinearity
# Restrict to numeric columns and plot the feature-feature correlation matrix
# to spot collinear groups.
df_train_num = df.select_dtypes(include=np.number)
print("Numerical:", df_train_num.shape)
plt.subplots(figsize=(38, 38))
sns.heatmap(df_train_num.corr(), annot =False, vmin=-1, vmax=1, center= 0, cmap= 'coolwarm', fmt='.1g')
# Figure 3: Heatmap correlation matrix for collinearity
# One distribution plot per column, overlaying the Bankrupt (Y=1) and
# Not Bankrupt (Y=0) sub-populations.
# NOTE(review): nrows=0 yields an empty axes grid — fig/axs are unused; the
# panels are created via plt.subplot() inside the loop. Confirm intent.
fig, axs = plt.subplots(ncols=2, nrows=0, figsize=(12, 200))
plt.subplots_adjust(right=2)
plt.subplots_adjust(top=2)
for i, col in enumerate(list(df), 1):
    # i-th panel of a (n_columns x 3) subplot grid.
    plt.subplot(len(list(df)), 3, i)
    sns.set_style("white")
    # Split the column by target value.
    x1 = df.loc[df['Bankrupt?']==1,[col]]
    x2 = df.loc[df['Bankrupt?']==0,[col]]
    # Plot
    #kwargs = dict(hist_kws={'alpha':.6}, kde_kws={'linewidth':2})
    #plt.figure(figsize=(10,7), dpi= 80)
    # NOTE(review): sns.distplot is deprecated in seaborn >= 0.11.
    sns.distplot(x1, color="dodgerblue", label="Bankrupt")
    sns.distplot(x2, color="orange", label="Not Bankrupt")
    plt.ylim(0,None)
    plt.xlabel('{}'.format(col), size=15,labelpad=12.5)
    plt.ylabel('Bankrupt?', size=15, labelpad=12.5)
    for j in range(2):
        plt.tick_params(axis='x', labelsize=12)
        plt.tick_params(axis='y', labelsize=12)
    #plt.xlabel(col)
    plt.legend(loc='best', prop={'size': 10})
plt.show()
# Figure 4: Distribution plot, split by target variable Y=0 and Y=1
# Sanity-check per-column min/max before deciding on scaling.
# NOTE(review): this builds a 2-element Series whose values are themselves
# Series (one min and one max per column) — works for eyeballing only.
pd.Series(index=['min','max'],data=[df.min(),df.max()]) # double check min, max
# Figure 5: Transform and scale skewed variables
# log and max-min transform Independent Variabe
def data_trans(df):
    """
    Log1p-transform, then min-max scale, every skewed column — in place.

    A column is transformed when |skew| > 0.5 AND its max exceeds 1
    (ratio columns already bounded near [0, 1] are left untouched).
    Prints the name of each transformed column.

    Parameters
    ----------
    df : pandas.DataFrame — mutated in place.

    Returns
    -------
    pandas.DataFrame — the same (mutated) object that was passed in.
    """
    for col in df:
        # Local names avoid shadowing builtin max / scipy.stats.skew.
        col_skew = df[col].skew()
        col_max = df[col].max()
        if (col_skew > 0.5 or col_skew < -0.5) and col_max > 1:
            print(col)
            df[col] = np.log1p(df[col])
            # Min-max scale to [0, 1] after the log transform.
            df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
    return df
# data_trans mutates in place, so df_norm and df reference the same object.
df_norm = data_trans(df)
df_norm.describe()
# Figure 6: Display max-min scaling
# Same per-column distribution plot as Figure 4, now on the transformed data.
# NOTE(review): nrows=0 yields an empty axes grid — fig/axs are unused; the
# panels are created via plt.subplot() inside the loop. Confirm intent.
fig, axs = plt.subplots(ncols=2, nrows=0, figsize=(12, 200))
plt.subplots_adjust(right=2)
plt.subplots_adjust(top=2)
for i, col in enumerate(list(df_norm), 1):
    # i-th panel of a (n_columns x 3) subplot grid.
    plt.subplot(len(list(df_norm)), 3, i)
    sns.set_style("white")
    # Split the column by target value.
    x1 = df_norm.loc[df_norm['Bankrupt?']==1,[col]]
    x2 = df_norm.loc[df_norm['Bankrupt?']==0,[col]]
    # Plot
    #kwargs = dict(hist_kws={'alpha':.6}, kde_kws={'linewidth':2})
    #plt.figure(figsize=(10,7), dpi= 80)
    sns.distplot(x1, color="dodgerblue", label="Bankrupt")
    sns.distplot(x2, color="orange", label="Not Bankrupt")
    plt.ylim(0,None)
    plt.xlabel('{}'.format(col), size=15,labelpad=12.5)
    plt.ylabel('Bankrupt?', size=15, labelpad=12.5)
    for j in range(2):
        plt.tick_params(axis='x', labelsize=12)
        plt.tick_params(axis='y', labelsize=12)
    #plt.xlabel(col)
    plt.legend(loc='best', prop={'size': 10})
plt.show()
# Figure 7: Distribution plot after log1p transformation and max-min scaling
# Features (X) and target (y — kept as a one-column DataFrame).
X = df_norm.drop(['Bankrupt?'], axis=1)
y = df_norm[['Bankrupt?']]
print(X.shape)
print(y.shape)
# Figure 8: Split the training set into an 80% training and 20% validation set
# First split the data set into a 90% raw and 10% test set
X_raw,X_test,y_raw,y_test = train_test_split(X, y,test_size=0.1,stratify = y, random_state = 321)
# Second split the raw data set into an 80% training and 20% validation set
X_train,X_val,y_train,y_val = train_test_split(X_raw,y_raw,test_size=0.2,stratify = y_raw,random_state = 321)
# Check that train, validation and test label distributions are similar —
# stratify= keeps the bankruptcy rate roughly equal across all three splits.
print(X_train.shape, y_train.shape,X_val.shape, y_val.shape, X_test.shape, y_test.shape)
print('Train dataset Bankrupt:', round(y_train.value_counts()[1] / len(y_train) * 100,2) ,'%')
print('Train dataset Not Bankrupt:', round(y_train.value_counts()[0] / len(y_train) * 100,2) ,'%')
print('Validation dataset Bankrupt:', round(y_val.value_counts()[1] / len(y_val) * 100,2) ,'%')
print('Validation dataset Not Bankrupt:', round(y_val.value_counts()[0] / len(y_val) * 100,2) ,'%')
print('Test dataset Bankrupt:', round(y_test.value_counts()[1] / len(y_test) * 100,2) ,'%')
print('Test dataset Not Bankrupt:', round(y_test.value_counts()[0] / len(y_test) * 100,2) ,'%')
# Figure 9: Rebalance the imbalanced training dataset with SMOTE oversampling
# Oversample ONLY the training split with SMOTE so the two classes are
# balanced; validation and test keep the natural class ratio.
print("Before OverSampling, counts of label '1': {}".format(sum(y_train.values==1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train.values==0)))
smote = SMOTE(random_state=2)
# BUG FIX: use fit_resample — fit_sample was deprecated in imbalanced-learn 0.4
# and removed in 0.8, so the old call fails on current versions.
X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)
# Re-wrap as DataFrames to restore the original column names.
X_train_oversampled = pd.DataFrame(X_train_oversampled, columns=X_train.columns)
y_train_oversampled = pd.DataFrame(y_train_oversampled, columns=y_train.columns)
print('After OverSampling, the shape of train_X: {}'.format(X_train_oversampled.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_oversampled.shape))
print("After OverSampling, counts of label '1': {}".format(sum(y_train_oversampled.values==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_oversampled.values==0)))
X_train_oversampled.head()
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix.
    Normalization can be applied by setting `normalize=True`.

    NOTE: this local definition shadows the sklearn.metrics
    plot_confusion_matrix imported at the top of the file; the calls in this
    notebook resolve to this function.

    Parameters
    ----------
    cm : (n_classes, n_classes) array of confusion-matrix counts.
    classes : sequence of axis tick labels.
    normalize : bool — if True, convert counts to row-normalized rates.
    title, cmap : plot cosmetics.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)
    if normalize:
        # Row-normalize so each actual class sums to 1.
        # (The original's `else: 1` no-op branch was dead code and is removed.)
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    # Choose text color for contrast against the heatmap cells.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
def model_results(notes,logreg, select_feature,X_train, y_train, X_val, y_val,display_OLS='Y'):
    """
    Fit an estimator on X_train[select_feature] and report train/validation
    metrics, the ROC curve, the precision-recall curve, and a confusion
    matrix for the validation set.

    Parameters
    ----------
    notes : str — banner printed above the report.
    logreg : any sklearn-style estimator with fit/predict/predict_proba/score
        (despite the name, trees, SVMs etc. are passed here too).
    select_feature : list of column names to use.
    X_train, y_train : training data (typically SMOTE-oversampled).
    X_val, y_val : validation data (left at the natural class ratio).
    display_OLS : 'Y' to also print a statsmodels OLS summary of
        y_train ~ X_train[select_feature].
    """
    model = logreg.fit(X_train[select_feature], y_train)
    # Predictions for both splits; probabilities are for the positive class.
    y_train_pred = logreg.predict(X_train[select_feature])
    y_pred = logreg.predict(X_val[select_feature])
    y_pred_proba = logreg.predict_proba(X_val[select_feature])[:, 1]
    [fpr, tpr, thr] = roc_curve(y_val, y_pred_proba)
    print("#"*25,notes,"#"*25)
    print("Training Accuracy = {:.4f}".format(logreg.score(X_train[select_feature], y_train)))
    print("Validation Accuracy = {:.4f}".format(logreg.score(X_val[select_feature], y_val)))
    # BUG FIX: these previously compared y_train against itself
    # (e.g. roc_auc_score(y_train, y_train)), which always reports a perfect
    # score; score the model's training predictions instead.
    print("\nTraining ROC_AUC_score : %.4f" % (roc_auc_score(y_train, y_train_pred)))
    print("Validation ROC_AUC_score : %.4f" % (roc_auc_score(y_val, y_pred)))
    print("\nTraining f1_score : %.6f" % (f1_score(y_train, y_train_pred)))
    print("Validation f1_score : %.6f" % (f1_score(y_val, y_pred)))
    #Confusion Matrix
    print("-"*15,"confusion_matrix","-"*15)
    print(confusion_matrix(y_val, y_pred))
    print('\n\n')
    print("-"*15,"CLASSIFICATION REPORT","-"*15)
    print(classification_report(y_val, y_pred, digits=4))
    # Index of the first threshold whose sensitivity (TPR) exceeds 0.95.
    idx = np.min(np.where(tpr > 0.95))
    plt.figure()
    plt.plot(fpr, tpr, color='coral', label='ROC curve (area = %0.4f)' % auc(fpr, tpr))
    plt.plot([0, 1], [0, 1], 'k--')
    # Cross-hairs marking the 95%-sensitivity operating point.
    plt.plot([0,fpr[idx]], [tpr[idx],tpr[idx]], 'k--', color='blue')
    plt.plot([fpr[idx],fpr[idx]], [0,tpr[idx]], 'k--', color='blue')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate (1 - specificity)', fontsize=14)
    plt.ylabel('True Positive Rate (recall)', fontsize=14)
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()
    print("\n\nUsing a threshold of %.4f " % thr[idx] + "guarantees a sensitivity of %.4f " % tpr[idx] +
          "and a specificity of %.4f" % (1-fpr[idx]) +
          ", i.e. a false positive rate of %.4f%%." % (np.array(fpr[idx])*100))
    print('\n\n')
    plt.figure()
    #precision-recall curve
    precision, recall, thresholds_pr = precision_recall_curve(y_val, y_pred_proba)
    avg_pre = average_precision_score(y_val, y_pred_proba)
    plt.plot(precision, recall, label = " average precision = {:0.4f}".format(avg_pre), lw = 3, alpha = 0.7)
    plt.xlabel('Precision', fontsize = 14)
    plt.ylabel('Recall', fontsize = 14)
    plt.title('Precision-Recall Curve', fontsize = 18)
    plt.legend(loc = 'best')
    # Mark the point closest to the default 0.5 decision threshold.
    close_default = np.argmin(np.abs(thresholds_pr - 0.5))
    plt.plot(precision[close_default], recall[close_default], 'o', markersize = 8)
    plt.show()
    print('\n\n')
    class_names = [0,1]
    plt.figure()
    plot_confusion_matrix(confusion_matrix(y_val, y_pred), classes=class_names, title='Confusion Matrix')
    plt.show()
    print('\n\n')
    if display_OLS == 'Y':
        # NOTE(review): this fits OLS (a linear probability model), not logit,
        # despite the variable name — confirm whether sm.Logit was intended.
        logitmodel = sm.OLS(y_train, X_train[select_feature]).fit()
        print(logitmodel.summary())
# Figure 10: Junk Model with one variable
# Baseline ("junk") model: logistic regression on a single feature.
# NOTE: column names in this dataset carry a leading space.
select_feature = [' Net Income to Total Assets']
logreg = LogisticRegression()
model_results('Model 1: Junk Model',logreg=logreg,select_feature=select_feature,X_train=X_train_oversampled, y_train=y_train_oversampled, X_val=X_val, y_val=y_val,display_OLS='Y')
# Correlation-based pre-selection on the (non-oversampled) training split:
# keep features with |corr(feature, target)| > 0.1, excluding the target itself.
X_norm = pd.concat([X_train,y_train], axis=1)
print(X_norm.head())
X_norm_corr = pd.DataFrame(X_norm.corr()['Bankrupt?'].sort_values(ascending=False).reset_index()).rename(columns={'index': 'col', 'Bankrupt?': 'corr'})
print(X_norm_corr.loc[(X_norm_corr['corr']).abs()>.1])
select_feature =list(set(X_norm_corr.loc[(X_norm_corr['corr']).abs()>.1]['col'].values) - set(['Bankrupt?']))
print(select_feature)
from statsmodels.stats.outliers_influence import variance_inflation_factor
def calculate_vif_(X, thresh=100):
    """
    Greedy VIF-based feature pruning.

    Repeatedly recompute the variance inflation factor of every remaining
    column and drop the worst offender, until all VIFs are <= thresh.
    Prints each dropped column and the surviving column set.

    Parameters
    ----------
    X : pandas.DataFrame of numeric features (not modified).
    thresh : float — maximum VIF tolerated.

    Returns
    -------
    pandas.DataFrame — view of X restricted to the surviving columns.
    """
    cols = X.columns
    keep = np.arange(X.shape[1])
    while True:
        data = X[cols[keep]].values
        vifs = [variance_inflation_factor(data, i) for i in range(data.shape[1])]
        worst = int(np.argmax(vifs))  # first occurrence of the maximum
        if vifs[worst] <= thresh:
            break
        print('dropping \'' + X[cols[keep]].columns[worst] + '\' at index: ' + str(worst))
        keep = np.delete(keep, worst)
    print('Remaining variables:')
    print(X.columns[keep])
    return X[cols[keep]]
# Run the VIF filter (threshold 100) on the correlation-selected features.
X_train2 = X_train[select_feature]
calculate_vif_(X_train2, thresh=100)
# Feature lists used for the next models.
# NOTE(review): presumably hand-copied from the printed VIF output above — confirm.
select_feature =[' Cash/Total Assets', ' Equity to Long-term Liability',
                 ' Current Liability to Assets', ' Tax rate (A)',
                 ' Total expense/Assets',
                 ' Liability-Assets Flag', ' Net Value Per Share (B)', ' CFO to Assets',
                 ' Cash/Current Liability',
                 ' Operating Profit Per Share (Yuan ¥)',
                 ' Current Liability to Current Assets',' Net Income to Total Assets']
select_feature2=[' ROA(B) before interest and depreciation after tax', ' CFO to Assets',
                 ' Cash Flow to Total Assets',
                 ' Interest Coverage Ratio (Interest expense to EBIT)',
                 ' Current Liability to Current Assets',
                 ' Equity to Long-term Liability', ' Debt ratio %',
                 ' Working Capital to Total Assets', ' Cash/Current Liability']
# check collinearity
plt.subplots(figsize=(38, 38))
sns.heatmap(X_norm[select_feature].corr(), annot =True, vmin=-1, vmax=1, center= 0, cmap= 'coolwarm', fmt='.1g')
# Model 2: logistic regression on the second hand-picked feature set.
logreg = LogisticRegression()
model_results('Model 2: Hand-picked Model',logreg=logreg,select_feature=select_feature2,X_train=X_train_oversampled, y_train=y_train_oversampled, X_val=X_val, y_val=y_val,display_OLS='Y')
#parameters = {'C': np.linspace(1, 1000, 1000)}
# Randomized search over penalty and C, scored by F1.
# NOTE(review): 'l1'/'elasticnet' are incompatible with the default lbfgs
# solver, so those candidates cannot fit — confirm this is acceptable.
parameters = {"penalty": ['l2','l1','elasticnet'],'C': np.linspace(1, 1000, 1000),}
logreg = LogisticRegression()
#clf = GridSearchCV(logreg, parameters, cv=5, verbose=5, n_jobs=3)
rand_logreg = RandomizedSearchCV(logreg, parameters, n_iter=4,scoring='f1')
rand_logreg.fit(X_train_oversampled[select_feature], y_train_oversampled)
print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", rand_logreg.best_estimator_)
print("\n The best score across ALL searched params:\n", rand_logreg.best_score_)
print("\n The best parameters across ALL searched params:\n", rand_logreg.best_params_)
# Figure 11: Logistic regression model
# Logistic regression refit with the best C found by the search above.
logreg1 = LogisticRegression(C=503.0, class_weight=None, dual=False, fit_intercept=True,
                             intercept_scaling=1, l1_ratio=None, max_iter=100,
                             multi_class='auto', n_jobs=None, penalty='l2',
                             random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                             warm_start=False)
# NOTE(review): the banner repeats the 'Model 2' label used above — confirm the
# intended numbering.
model_results('Model 2: Hand-picked Model',logreg=logreg1,select_feature=select_feature,X_train=X_train_oversampled, y_train=y_train_oversampled, X_val=X_val, y_val=y_val,display_OLS='Y')
# Randomized search over GaussianNB smoothing on the select_feature2 set.
gnb_sm = GaussianNB()
gnb_params = {'var_smoothing': np.logspace(0,-2, num=1000)}
rand_gnb = RandomizedSearchCV(gnb_sm, gnb_params, n_iter=4,scoring='f1')
rand_gnb.fit(X_train_oversampled[select_feature2], y_train_oversampled)
print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", rand_gnb.best_estimator_)
print("\n The best score across ALL searched params:\n", rand_gnb.best_score_)
print("\n The best parameters across ALL searched params:\n", rand_gnb.best_params_)
# Figure 12: Naïve Bayes model
# GaussianNB with the tuned smoothing value from the search above.
gaussian_nb = GaussianNB(priors=None,var_smoothing=0.05817880074344935)
model_results('Model 3: GaussianNB Model',logreg=gaussian_nb,select_feature=select_feature2,X_train=X_train_oversampled, y_train=y_train_oversampled, X_val=X_val, y_val=y_val,display_OLS='N')
# Grid Search
# Parameter Grid
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid','linear']}
# Make grid search classifier
#clf_grid = GridSearchCV(svm.SVC(), param_grid,refit=True,verbose=2)
# Randomized (not exhaustive) search over the SVM grid, scored by F1.
rand_svc = RandomizedSearchCV(svm.SVC(), param_grid, n_iter=4,scoring='f1')
rand_svc.fit(X_train_oversampled[select_feature], y_train_oversampled)
print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", rand_svc.best_estimator_)
print("\n The best score across ALL searched params:\n", rand_svc.best_score_)
print("\n The best parameters across ALL searched params:\n", rand_svc.best_params_)
# Figure 13: SVM model
# SVM with the searched parameters; probability=True is required because
# model_results calls predict_proba.
model_svm= svm.SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
                   decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
                   probability=True, random_state=None, shrinking=True, tol=0.001,
                   verbose=False)
model_results('Model 4: SVM Model',logreg=model_svm,select_feature=select_feature,X_train=X_train_oversampled, y_train=y_train_oversampled, X_val=X_val, y_val=y_val,display_OLS='N')
# PCA: dimensionality reduction with principal component analysis
# Standardizing the features
# Keep as many components as needed to explain 99% of the variance.
npca = .99
#X_trainPCA = StandardScaler().fit_transform(X_train_oversampled)
pca = PCA(n_components=npca)
# NOTE(review): PCA is fit on unscaled features — the StandardScaler line is
# commented out; confirm that is intentional.
pca_train = pca.fit_transform(X_train_oversampled)#
pca_trainDF = pd.DataFrame(data = pca_train)
#X_valPCA = StandardScaler().fit_transform(X_val)
# Project the validation set with the PCA fit on the training set.
pca_val = pca.transform(X_val)
pca_valDF = pd.DataFrame(data = pca_val)
pca.explained_variance_ratio_
parameters = {"penalty": ['l2','l1','elasticnet'],'C': np.linspace(1, 1000, 1000),}
# NOTE(review): pre-fitting this estimator is unnecessary — RandomizedSearchCV
# clones and refits it for every candidate.
logregPCA = LogisticRegression().fit(pca_trainDF, y_train_oversampled)
rand_logregPCA = RandomizedSearchCV(logregPCA, parameters, n_iter=4,scoring='f1')
rand_logregPCA.fit(pca_trainDF, y_train_oversampled)
print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", rand_logregPCA.best_estimator_)
print("\n The best score across ALL searched params:\n", rand_logregPCA.best_score_)
print("\n The best parameters across ALL searched params:\n", rand_logregPCA.best_params_)
print(pca.explained_variance_ratio_)
# Use every retained principal component as a feature.
select_featurePCA=pca_trainDF.columns
print(select_featurePCA)
# Logistic regression on PCA features with the best C from the search.
logregPCA1 = LogisticRegression(C=944.0, class_weight=None, dual=False, fit_intercept=True,
                                intercept_scaling=1, l1_ratio=None, max_iter=100,
                                multi_class='auto', n_jobs=None, penalty='l2',
                                random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                                warm_start=False)
model_results('Model 2.1: Logistic PCA Model',logreg=logregPCA1,select_feature=select_featurePCA,X_train=pca_trainDF, y_train=y_train_oversampled, X_val=pca_valDF, y_val=y_val,display_OLS='Y')
# Tune GaussianNB smoothing on the PCA features.
gnb_sm = GaussianNB()
gnb_params = {'var_smoothing': np.logspace(0,-2, num=1000)}
rand_gnbPCA = RandomizedSearchCV(gnb_sm, gnb_params, n_iter=10,scoring='f1')
rand_gnbPCA.fit(pca_trainDF, y_train_oversampled)
print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", rand_gnbPCA.best_estimator_)
print("\n The best score across ALL searched params:\n", rand_gnbPCA.best_score_)
print("\n The best parameters across ALL searched params:\n", rand_gnbPCA.best_params_)
# GaussianNB on PCA features with the tuned smoothing value.
gaussian_nbPCA = GaussianNB(priors=None, var_smoothing=0.013369837418249465)
model_results('Model 3.1: GaussianNB Model',logreg=gaussian_nbPCA,select_feature=select_featurePCA,X_train=pca_trainDF, y_train=y_train_oversampled, X_val=pca_valDF, y_val=y_val,display_OLS='N')
# Grid Search
# Parameter Grid
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid','linear']}
# Make grid search classifier
#clf_grid = GridSearchCV(svm.SVC(), param_grid,refit=True,verbose=2)
# Randomized SVM search on the PCA features.
rand_svcPCA = RandomizedSearchCV(svm.SVC(), param_grid, n_iter=4,scoring='f1')
rand_svcPCA.fit(pca_trainDF, y_train_oversampled)
print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", rand_svcPCA.best_estimator_)
print("\n The best score across ALL searched params:\n", rand_svcPCA.best_score_)
print("\n The best parameters across ALL searched params:\n", rand_svcPCA.best_params_)
# SVM on PCA features; probability=True is required by model_results.
model_svmPCA= svm.SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
                      decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
                      max_iter=-1, probability=True, random_state=None, shrinking=True,
                      tol=0.001, verbose=False)
model_results('Model 4.1: SVM Model',logreg=model_svmPCA,select_feature=select_featurePCA,X_train=pca_trainDF, y_train=y_train_oversampled, X_val=pca_valDF, y_val=y_val,display_OLS='N')
# Randomized hyper-parameter search for a random forest classifier.
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 80, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [10,20,40,80]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True, False]
criterion=['entropy','gini']
class_weight= ['balanced','balanced_subsample']
# Create the param grid
param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'bootstrap': bootstrap,
              'criterion':criterion,
              'class_weight' :class_weight}
print(param_grid)
model_rf = RandomForestClassifier()
# BUG FIX: the search previously referenced an undefined name `rf_Model`
# (NameError at runtime); use the estimator defined just above.
rand_rf = RandomizedSearchCV(estimator = model_rf, param_distributions = param_grid, cv = 10, verbose=2, n_jobs = 4,scoring='f1')
rand_rf.fit(X_train_oversampled, y_train_oversampled)
print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", rand_rf.best_estimator_)
print("\n The best score across ALL searched params:\n", rand_rf.best_score_)
print("\n The best parameters across ALL searched params:\n", rand_rf.best_params_)
# Figure 14: Random Forest Classifier
# Use every feature for the tree-based models.
select_feature3 = X_train.columns
# Random forest with the best parameters from the randomized search.
model_rf= RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight='balanced',
                                 criterion='entropy', max_depth=20, max_features='sqrt',
                                 max_leaf_nodes=None, max_samples=None,
                                 min_impurity_decrease=0.0, min_impurity_split=None,
                                 min_samples_leaf=1, min_samples_split=2,
                                 min_weight_fraction_leaf=0.0, n_estimators=64,
                                 n_jobs=None, oob_score=False, random_state=None,
                                 verbose=0, warm_start=False)
model_results('Model 5: RandomForestClassifier Model',logreg=model_rf,select_feature=select_feature3,X_train=X_train_oversampled, y_train=y_train_oversampled, X_val=X_val, y_val=y_val,display_OLS='N')
# Plot the most important features from the fitted forest.
features = X_train.columns
importances = model_rf.feature_importances_
indices = np.argsort(importances)  # ascending, so the last entries are the largest
# customized number
num_features = 10
plt.figure(figsize=(20,10))
plt.title('Feature Importances')
# only plot the customized number of features
plt.barh(range(num_features), importances[indices[-num_features:]], color='lightskyblue', align='center')
plt.yticks(range(num_features), [features[i] for i in indices[-num_features:]])
plt.xlabel('Relative Importance')
plt.show()
# XGBoost classifier with a randomized search sampling from scipy distributions.
model_xgb = xgb.XGBClassifier(objective = 'binary:logistic')
param_dist = {'n_estimators': stats.randint(150, 1000),
              'learning_rate': stats.uniform(0.01, 0.59),
              'subsample': stats.uniform(0.3, 0.6),
              'max_depth': [3, 4, 5, 6, 7, 8, 9],
              'colsample_bytree': stats.uniform(0.5, 0.4),
              'min_child_weight': [1, 2, 3, 4]
             }
# error_score=0 scores failed fits as 0 instead of raising.
rand_xgb = RandomizedSearchCV(model_xgb,
                              param_distributions = param_dist,
                              cv =10,
                              n_iter = 5,
                              error_score = 0,
                              verbose = 3,
                              n_jobs = -1,scoring='f1')
rand_xgb.fit(X_train_oversampled, y_train_oversampled)
print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", rand_xgb.best_estimator_)
print("\n The best score across ALL searched params:\n", rand_xgb.best_score_)
print("\n The best parameters across ALL searched params:\n", rand_xgb.best_params_)
# Figure 15: Gradient Boosted Trees
# XGBoost refit with the best parameters from the search above.
model_xgb= xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                             colsample_bynode=1, colsample_bytree=0.8796922826172242, gamma=0,
                             learning_rate=0.21044046988031048, max_delta_step=0, max_depth=7,
                             min_child_weight=4, missing=None, n_estimators=643, n_jobs=1,
                             nthread=None, objective='binary:logistic', random_state=0,
                             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
                             silent=None, subsample=0.6424182378732652, verbosity=1)
model_results('Model 6: XGBClassifier Model',logreg=model_xgb,select_feature=select_feature3,X_train=X_train_oversampled, y_train=y_train_oversampled, X_val=X_val, y_val=y_val,display_OLS='N')
# Randomized search for an extra-trees classifier.
# NOTE(review): oob_score=True requires bootstrap=True, but ExtraTreesClassifier
# defaults to bootstrap=False — those candidates will fail to fit; confirm.
param_dist = {'max_features' : ['sqrt','log2',0.5,0.6,0.7,0.8,0.9,1.0],
              'min_samples_leaf' : [1, 2, 3, 7, 10, 11, 100],
              'max_depth': [10,11,12,13,14],
              'n_estimators': [50, 100,500],
              'oob_score': [True, False]}
#rand_extratress = RandomizedSearchCV(ExtraTreesRegressor(warm_start=True,bootstrap=True,random_state=7), param_distributions = param_dist, cv=3, n_iter = 15,random_state=7)
rand_extratress=RandomizedSearchCV(ExtraTreesClassifier(), param_dist, cv=10, scoring='f1')
rand_extratress.fit(X_train_oversampled, y_train_oversampled)
print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", rand_extratress.best_estimator_)
print("\n The best score across ALL searched params:\n", rand_extratress.best_score_)
print("\n The best parameters across ALL searched params:\n", rand_extratress.best_params_)
# Figure 16: Extra Trees
# Extra-trees classifier with the best parameters from the search above.
model_extraTrees=ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                                      criterion='gini', max_depth=11, max_features=0.7,
                                      max_leaf_nodes=None, max_samples=None,
                                      min_impurity_decrease=0.0, min_impurity_split=None,
                                      min_samples_leaf=11, min_samples_split=2,
                                      min_weight_fraction_leaf=0.0, n_estimators=500,
                                      n_jobs=None, oob_score=False, random_state=None, verbose=0,
                                      warm_start=False)
# BUG FIX: the report banner previously said 'ExtraTreesRegressor' although the
# estimator is an ExtraTreesClassifier.
model_results('Model 7: ExtraTreesClassifier Model',logreg=model_extraTrees,select_feature=select_feature3,X_train=X_train_oversampled, y_train=y_train_oversampled, X_val=X_val, y_val=y_val,display_OLS='N')